Build and Assess Machine learning Models for PredictionsΒΆ

In [1]:
# Load common libraries
import matplotlib as plt # visualize data graphically
import seaborn as sns; sns.set()
import numpy as np
import pandas as pd # To easily grap and transform data
from scipy import stats # To interpret output
#import statsmodels.api as sm

# Load revised demosgrahic data with the age grouping and amount grouping

# path = r"C:\USers\cthian972\Documents\data_anlaytics_class\class_exercise\Class2\demorgraphic_data_age_amount_grouping.csv"
# data = pd.read_csv(path)
data = pd.read_csv('demorgraphic_data_age_amount_grouping.csv')
df = pd.DataFrame(data)
df
Out[1]:
instore age items amt region amt_g1 amt_g2 amt_gtext age_g1 age_g2 age_gtext age_g3
0 0 37 4 281.03 2 5 5.0 5 3 2 2 1
1 0 35 2 219.51 2 5 2.5 5 3 2 2 1
2 1 45 3 1525.70 4 20 20.0 30 4 2 2 1
3 1 46 3 715.25 3 10 10.0 10 4 2 2 1
4 1 33 4 1937.50 1 20 20.0 30 2 2 2 1
... ... ... ... ... ... ... ... ... ... ... ... ...
79995 1 71 3 558.82 1 10 10.0 10 6 4 3 2
79996 0 59 7 1932.00 3 20 20.0 30 5 3 3 2
79997 0 54 1 414.16 2 5 5.0 5 4 3 2 1
79998 1 49 4 335.32 1 5 5.0 5 4 2 2 1
79999 1 30 1 527.12 3 10 10.0 10 2 2 2 1

80000 rows Γ— 12 columns

In [2]:
# check data types, in this case everything looks fine no changes need to be made. 
df.info()

# note: age_g1 is age grouping: age<=24 is 1, 24<age<=34 is 2, 34<age<=44 is 3, 44<age<=54 is 4, 54<age<=64 is 5,
#                               64<age<=74 is 6, age>74 is 7
#       age_g2 is age grouping: age<=24 is 1, 24<age<=49 is 2, 50<age<=65 is 3, age>65 is 4
#       age_gtext is age grouping: age<=27 is 1, 27<age<=55 is 2, age>55 is 3
#       age_g3  is age grouping: age<=55 is 1, age>55 is 2
#
# note: amt_g1 is amount grouping: amt<=500 is 5, 500<amt<=1000 is 10, 1000<amt<=2000 is 20, amt>2000 is 30
#       amt_g2 is amount grouping: amt<=50 is 0.5, 50<amt<=250 is 2.5, 250<amt<=500 is 5, 500<amt<=1000 is 10,
#                                  1000<amt<=1500 is 15, 1500<amt<=2000 is 20, amt>2000 is 30
#       amt_gtext is amount grouping: amt<=500 is 5, 500<amt<=1000 is 10, amt>1000 is 30
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   instore    80000 non-null  int64  
 1   age        80000 non-null  int64  
 2   items      80000 non-null  int64  
 3   amt        80000 non-null  float64
 4   region     80000 non-null  int64  
 5   amt_g1     80000 non-null  int64  
 6   amt_g2     80000 non-null  float64
 7   amt_gtext  80000 non-null  int64  
 8   age_g1     80000 non-null  int64  
 9   age_g2     80000 non-null  int64  
 10  age_gtext  80000 non-null  int64  
 11  age_g3     80000 non-null  int64  
dtypes: float64(2), int64(10)
memory usage: 7.3 MB
In [3]:
#check for missing values
display(df.isna().any())

#drop any missing values
# df = df.dropna()
instore      False
age          False
items        False
amt          False
region       False
amt_g1       False
amt_g2       False
amt_gtext    False
age_g1       False
age_g2       False
age_gtext    False
age_g3       False
dtype: bool
In [4]:
# Checking for duplicate rows. Remove them before applying Machine learning algorithm
df = df.drop_duplicates()
df
Out[4]:
instore age items amt region amt_g1 amt_g2 amt_gtext age_g1 age_g2 age_gtext age_g3
0 0 37 4 281.03 2 5 5.0 5 3 2 2 1
1 0 35 2 219.51 2 5 2.5 5 3 2 2 1
2 1 45 3 1525.70 4 20 20.0 30 4 2 2 1
3 1 46 3 715.25 3 10 10.0 10 4 2 2 1
4 1 33 4 1937.50 1 20 20.0 30 2 2 2 1
... ... ... ... ... ... ... ... ... ... ... ... ...
79995 1 71 3 558.82 1 10 10.0 10 6 4 3 2
79996 0 59 7 1932.00 3 20 20.0 30 5 3 3 2
79997 0 54 1 414.16 2 5 5.0 5 4 3 2 1
79998 1 49 4 335.32 1 5 5.0 5 4 2 2 1
79999 1 30 1 527.12 3 10 10.0 10 2 2 2 1

79979 rows Γ— 12 columns

Create a Statistical SummaryΒΆ

In [5]:
# use describe() method to create a statistical summary to help describe the dataset
# we want to see all the data falls within 3 standard deviations from the mean, checking for outliers
df.describe()
Out[5]:
instore age items amt region amt_g1 amt_g2 amt_gtext age_g1 age_g2 age_gtext age_g3
count 79979.000000 79979.000000 79979.000000 79979.000000 79979.000000 79979.000000 79979.000000 79979.000000 79979.000000 79979.000000 79979.000000 79979.000000
mean 0.500006 45.758512 4.505133 835.825727 2.674915 11.543155 10.419991 13.699846 3.623651 2.442966 2.134973 1.262819
std 0.500003 15.715158 2.061250 721.263650 1.126642 8.028241 8.130217 10.855713 1.558081 0.808014 0.610288 0.440168
min 0.000000 18.000000 1.000000 5.004700 1.000000 5.000000 0.500000 5.000000 1.000000 1.000000 1.000000 1.000000
25% 0.000000 33.000000 3.000000 285.120000 2.000000 5.000000 5.000000 5.000000 2.000000 2.000000 2.000000 1.000000
50% 1.000000 45.000000 4.000000 582.140000 3.000000 10.000000 10.000000 10.000000 4.000000 2.000000 2.000000 1.000000
75% 1.000000 56.000000 6.000000 1233.400000 4.000000 20.000000 15.000000 30.000000 5.000000 3.000000 3.000000 2.000000
max 1.000000 85.000000 8.000000 3000.000000 4.000000 30.000000 30.000000 30.000000 7.000000 4.000000 3.000000 2.000000
In [6]:
#create a reduced columns based on original dataset
original_columns = ['instore','age','items','amt','region']
df_reduced = df[original_columns] 
df_reduced
Out[6]:
instore age items amt region
0 0 37 4 281.03 2
1 0 35 2 219.51 2
2 1 45 3 1525.70 4
3 1 46 3 715.25 3
4 1 33 4 1937.50 1
... ... ... ... ... ...
79995 1 71 3 558.82 1
79996 0 59 7 1932.00 3
79997 0 54 1 414.16 2
79998 1 49 4 335.32 1
79999 1 30 1 527.12 3

79979 rows Γ— 5 columns

Explore the DataΒΆ

In [7]:
# Build a Scatter Plot X & y data
#X = df['amt']
#y = df['age']

%matplotlib inline

# create the scatter plot
sns.pairplot(df_reduced, hue='region', size=1.5);

#make sure it's formatted
#plt.title('amt vs age')
#plt.xlablel("Amount")
#plt.ylabel("Age")
#plt.legel()
#plt.show
C:\Users\cthian972\Anaconda3\envs\test6\lib\site-packages\seaborn\axisgrid.py:2079: UserWarning: The `size` parameter has been renamed to `height`; please update your code.
  warnings.warn(msg, UserWarning)
C:\Users\cthian972\Anaconda3\envs\test6\lib\site-packages\seaborn\distributions.py:288: UserWarning: Data must have variance to compute a kernel density estimate.
  warnings.warn(msg, UserWarning)
C:\Users\cthian972\Anaconda3\envs\test6\lib\site-packages\seaborn\distributions.py:288: UserWarning: Data must have variance to compute a kernel density estimate.
  warnings.warn(msg, UserWarning)

from the plots, we can see that bigger age span online than in-store, both online and instore sold no more than 8 items, online sales have greater amount span compare to in-store. region 2 have older buyers (both height and width), region4 have high spenders, the older folks in region spends less than 1000, Region 2 older than 75years old spent less than 500. Most people buy 2 to 7 items, less proportion of people buying just 1 or 8 items. in the last plot, we can see region 2 are highly contentrated from abt 0 to 500. 2 levels for the other regions: 1000 is a breaking point for R1 and R3, and 2000 a breaking point for R4.

Is it true that customers who shop in the store are older than customers who shop online?ΒΆ

In [9]:
# Create boxplot by instore
sns.set_style("whitegrid")
sns.boxplot(y = 'age', x = 'instore', data = df)
Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x1e41b82eb48>

Looking at the boxplots, the online have bigger age span, and the median is higher than in-store's median. With big overlap, we cannot conclude that the customers shop in-store are older than online shoppers.

In [10]:
# split amount data by instore
df_instore = df_reduced[df.instore > 0.5]
df_online = df_reduced[df.instore < 0.5]
#summary data for instore
df_instore.describe()
Out[10]:
instore age items amt region
count 39990.0 39990.000000 39990.000000 39990.000000 39990.000000
mean 1.0 42.958440 4.497099 774.106421 2.524881
std 0.0 13.877687 2.056026 508.118520 1.303586
min 1.0 19.000000 1.000000 50.050000 1.000000
25% 1.0 32.000000 3.000000 368.622500 1.000000
50% 1.0 42.000000 4.000000 688.180000 3.000000
75% 1.0 52.000000 6.000000 1024.400000 4.000000
max 1.0 74.000000 8.000000 1999.900000 4.000000
In [11]:
#summary data for online
df_online.describe()
Out[11]:
instore age items amt region
count 39989.0 39989.000000 39989.000000 39989.000000 39989.000000
mean 0.0 48.558654 4.513166 897.546576 2.824952
std 0.0 16.901714 2.066456 880.146928 0.891238
min 0.0 18.000000 1.000000 5.004700 2.000000
25% 0.0 35.000000 3.000000 228.150000 2.000000
50% 0.0 48.000000 5.000000 441.890000 3.000000
75% 0.0 60.000000 6.000000 1542.700000 4.000000
max 0.0 85.000000 8.000000 3000.000000 4.000000
In [12]:
# ony consider region 3 & 4 for they have both online and instore
df_3_4 = df_reduced[df.region > 2]
sns.set_style("whitegrid")
sns.boxplot(y = 'age', x = 'instore', data = df_3_4)
Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x1e41b8b8588>

Focusing on regions 3 and 4, the online buyers do not have buyers over 63years old. However, because of the IQRs are so close, we cannot say that there is any significant age different in the 2 groups

In [13]:
# split amount data by instore and region 3 or 4
df_instore_3_4 = df_3_4[df.instore > 0.5]
df_online_3_4 = df_3_4[df.instore < 0.5]
#summary data for instore
df_instore_3_4.describe()
C:\Users\cthian972\Anaconda3\envs\test6\lib\site-packages\ipykernel_launcher.py:2: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  
C:\Users\cthian972\Anaconda3\envs\test6\lib\site-packages\ipykernel_launcher.py:3: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  This is separate from the ipykernel package so we can avoid doing imports until
Out[13]:
instore age items amt region
count 23993.0 23993.000000 23993.000000 23993.000000 23993.000000
mean 1.0 42.461260 4.488309 793.518765 3.541575
std 0.0 13.715151 2.059762 517.060398 0.498279
min 1.0 19.000000 1.000000 50.132000 3.000000
25% 1.0 31.000000 3.000000 377.140000 3.000000
50% 1.0 42.000000 4.000000 704.860000 4.000000
75% 1.0 52.000000 6.000000 1088.100000 4.000000
max 1.0 74.000000 8.000000 1999.900000 4.000000
In [14]:
#summary data for online
df_online_3_4.describe()
Out[14]:
instore age items amt region
count 19995.0 19995.000000 19995.000000 19995.000000 19995.000000
mean 0.0 40.508627 4.513528 1542.963129 3.649862
std 0.0 12.971100 2.067495 834.130672 0.477025
min 0.0 18.000000 1.000000 100.230000 3.000000
25% 0.0 29.000000 3.000000 819.045000 3.000000
50% 0.0 40.000000 4.000000 1542.700000 4.000000
75% 0.0 52.000000 6.000000 2256.550000 4.000000
max 0.0 63.000000 8.000000 3000.000000 4.000000

Measure the Linear CorrelationsΒΆ

In [15]:
# Strength of correlations:
#   Very strong relationship  (|r| >= 0.8)
#   Strong relationship       (|r| >= 0.6)
#   Moderate relationship     (|r| >= 0.4)
#   Weak relationship         (|r| >= 0.2)
#   Very weak relationship    (0< |r|< 0.2)
# Create Pearson correlation matrix (corr_mat) to identify which features are correlated 
# and which will have more impact on the target column than some others
corr_mat = df_reduced.corr()
print(corr_mat)
          instore       age     items       amt    region
instore  1.000000 -0.178180 -0.003897 -0.085573 -0.133171
age     -0.178180  1.000000  0.000657 -0.282033 -0.235370
items   -0.003897  0.000657  1.000000  0.000384 -0.001904
amt     -0.085573 -0.282033  0.000384  1.000000  0.403486
region  -0.133171 -0.235370 -0.001904  0.403486  1.000000

age and amount, region have weak negative correlation,age and instore have very weak correlation, age and items are not correlated

In [16]:
# Create Pearson correlation matrix visualization, and limit to 2 decimal places:
corr_mat.style.background_gradient(cmap='coolwarm').set_precision(2)
Out[16]:
instore age items amt region
instore 1.00 -0.18 -0.00 -0.09 -0.13
age -0.18 1.00 0.00 -0.28 -0.24
items -0.00 0.00 1.00 0.00 -0.00
amt -0.09 -0.28 0.00 1.00 0.40
region -0.13 -0.24 -0.00 0.40 1.00
In [17]:
# See the number of transaction by the type of shopping
df["instore"].value_counts()
Out[17]:
1    39990
0    39989
Name: instore, dtype: int64
In [18]:
# split amount data by the regions 1,2,3,4
df1 = df[df.region < 2] # region 1
df2 = df[(df.region > 1) & (df.region < 3)] # region 2
df3 = df[(df.region > 2) & (df.region < 4) ] # region 3
df4 = df[df.region > 3]# region 4
df3_0 = df[(df.region > 2) & (df.region < 4) & (df.instore < 1)] # region 3 online
df3_1 = df[(df.region > 2) & (df.region < 4) & (df.instore > 0)] # region 3 in-store
df4_0 = df[(df.region > 3) & (df.instore < 1)] # region 4 online
df4_1 = df[(df.region > 3) & (df.instore > 0)] # region 4 in-store
In [19]:
# Pearson Correlation matrix for region 1
new_columns = ['instore','age','items','amt']
df1_reduced = df1[new_columns] 
corr_mat1 = df1_reduced.corr()
corr_mat1.style.background_gradient(cmap='coolwarm').set_precision(2)
C:\Users\cthian972\Anaconda3\envs\test6\lib\site-packages\pandas\io\formats\style.py:1089: RuntimeWarning: All-NaN slice encountered
  smin = np.nanmin(s.to_numpy()) if vmin is None else vmin
C:\Users\cthian972\Anaconda3\envs\test6\lib\site-packages\pandas\io\formats\style.py:1090: RuntimeWarning: All-NaN slice encountered
  smax = np.nanmax(s.to_numpy()) if vmax is None else vmax
C:\Users\cthian972\Anaconda3\envs\test6\lib\site-packages\matplotlib\colors.py:527: RuntimeWarning: invalid value encountered in less
  xa[xa < 0] = -1
Out[19]:
instore age items amt
instore nan nan nan nan
age nan 1.00 0.01 -0.22
items nan 0.01 1.00 -0.01
amt nan -0.22 -0.01 1.00

age and amount have weak negative correlation, age and items are not correlated

In [20]:
# Pearson Correlation matrix for region 2
df2_reduced = df2[new_columns] 
corr_mat2 = df2_reduced.corr()
corr_mat2.style.background_gradient(cmap='coolwarm').set_precision(2)
Out[20]:
instore age items amt
instore nan nan nan nan
age nan 1.00 0.00 0.01
items nan 0.00 1.00 -0.00
amt nan 0.01 -0.00 1.00

age and items, age and amount are not correlated

In [21]:
# Pearson Correlation matrix for region 3
df3_reduced = df3[new_columns] 
corr_mat3 = df3_reduced.corr()
corr_mat3.style.background_gradient(cmap='coolwarm').set_precision(2)
Out[21]:
instore age items amt
instore 1.00 0.29 -0.00 -0.66
age 0.29 1.00 0.00 -0.19
items -0.00 0.00 1.00 -0.00
amt -0.66 -0.19 -0.00 1.00

amount is (moderate) negatively correlated with instore

age is weakly correlated with instore, and amount. no correlation with items

In [22]:
# Pearson Correlation matrix for region 3 online
newer_columns = ['age','items','amt']
df3_0_reduced = df3_0[newer_columns] 
corr_mat3_0 = df3_0_reduced.corr()
corr_mat3_0.style.background_gradient(cmap='coolwarm').set_precision(2)
Out[22]:
age items amt
age 1.00 -0.00 -0.00
items -0.00 1.00 -0.00
amt -0.00 -0.00 1.00

no correlation

In [23]:
# Pearson Correlation matrix for region 3 in-store
df3_1_reduced = df3_1[newer_columns] 
corr_mat3_1 = df3_1_reduced.corr()
corr_mat3_1.style.background_gradient(cmap='coolwarm').set_precision(2)
Out[23]:
age items amt
age 1.00 0.00 -0.01
items 0.00 1.00 -0.01
amt -0.01 -0.01 1.00

no linear correlation between items and age. Very weak negative correlation between amt and age.

In [24]:
# Pearson Correlation matrix for region 4
df4_reduced = df4[new_columns] 
corr_mat4 = df4_reduced.corr()
corr_mat4.style.background_gradient(cmap='coolwarm').set_precision(2)
Out[24]:
instore age items amt
instore 1.00 -0.15 -0.01 -0.34
age -0.15 1.00 -0.01 0.05
items -0.01 -0.01 1.00 0.01
amt -0.34 0.05 0.01 1.00

instore and amount have weak negative correlation, age and amt have very weak postitive correlatiob, age and instore have very weak correlation. items is not correlated with instore, age,or amount

In [25]:
# Pearson Correlation matrix for region 4 online
df4_0_reduced = df4_0[newer_columns] 
corr_mat4_0 = df4_0_reduced.corr()
corr_mat4_0.style.background_gradient(cmap='coolwarm').set_precision(2)
Out[25]:
age items amt
age 1.00 -0.00 -0.01
items -0.00 1.00 0.01
amt -0.01 0.01 1.00

no correlation

In [26]:
# Pearson Correlation matrix for region 4 in-store
# Pearson Correlation matrix for region 3 in-store
df4_1_reduced = df4_1[newer_columns] 
corr_mat4_1 = df4_1_reduced.corr()
corr_mat4_1.style.background_gradient(cmap='coolwarm').set_precision(2)
Out[26]:
age items amt
age 1.00 -0.01 0.00
items -0.01 1.00 0.00
amt 0.00 0.00 1.00

no linear correlation

Simple Linear RegressionΒΆ

In [ ]:
#import the linear regression class:
from sklearn.linear_model import LinearRegression

#nstantiate LinearRegression class and specify that we would like to fit the intercept using fit_intercept hyperparameter:
model = LinearRegression(fit_intercept=True)
model
In [ ]:
# Arrange data into a features matrix and target 
# we want to look at region 1 only, because the correlation matrix show some weak correlation between age and amt
X_regr = df[['age'],['instore']]
X_regr.shape
In [ ]:
#identify target array, y
y = df1[['amt']] 
y.shape
In [ ]:
# Use fit() method to fit the model
# This fit() command causes a number of model-dependent internal computations to take place, and the results of these 
# computations are stored in model-specific attributes that the user can explore. 

model.fit(X_regr, y)
In [ ]:
# In Scikit-Learn, by convention all model parameters that were learned during the fit() process have trailing underscores
# The coefficients 
model.coef_
In [ ]:
# y-intercept
model.intercept_

For Region 1, the fitted model is amt = 1080.24 - 7.67age

In [ ]:
# Once the model is trained, the main task of supervised machine learning is to evaluate it based on what it says about new
# data that was not part of the training set. In Scikit-Learn, this can be done using the predict() method. 
# For the sake of this example, our "new data" will be a grid of x values, and we will ask what y values the model predicts:
# coerce these x values into a [n_samples, n_features]
xfit = np.linspace(18, 64)
In [ ]:
Xfit = xfit[:, np.newaxis]
yfit = model.predict(Xfit)
In [ ]:
# Visualize the results by plotting first the raw data, and then this model fit:
# sns.set_style('whitegrid') 
# sns.lmplot(xfit, yfit, data = df1 )

Cross Validation and Choosing the Right ModelΒΆ

In [27]:
# Leverage the built in machine learning Models in the libraries
from sklearn.tree import DecisionTreeClassifier # A Decision Tree Classifier
from sklearn.model_selection import train_test_split # Split arrays/matrices into random train and test subsets
from sklearn import linear_model # Import linear 
#from sklearn.linear_model import LinearRegression # Import linear model
from sklearn.model_selection import cross_val_score # Import cross_val_score function
from sklearn.metrics import accuracy_score #Accuracy classification score.(best performance =1)
from sklearn.metrics import confusion_matrix # describe performance of a classification model, visualization algorithm performance
from sklearn.metrics import classification_report #A text report shows main classification metrics
from sklearn.ensemble import RandomForestClassifier # A random forest is a meta estimator that fits a number of 
# decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive 
# accuracy and control over-fitting. The sub-sample size is always the same as the original input sample size 
# but  the samples aredrawn with replacement if bootstrap=True (default).
from sklearn.ensemble import GradientBoostingClassifier #GB builds an additive model in a forward stage-wise 
# fashion allows fowr the optimization of arbitrary differentiable loss functions. In each stage 
# n_classes_ regression trees are fit on the negative gradient of the binomial or multinomial deviance 
# loss function. Binary classification is a special case where only a single regression tree is induced.
from sklearn.tree import export_graphviz # Export decision tree in DOT format
In [28]:
# Slicing using the [ ] operator selects a set of rows and/or columns from a DataFrame
# To slice out a set of rows, use the following syntax: data[start:stop]. 
# To select any given column, we can use Pandas to select a column by its name
# variable_name = dataframe['columnName']

# split dataset in Features and Target variable
# select features to create feature matrix {n samples, m columns|
# The features (columns) always refer to the distinct observations that describe each sample in a quantitative manner. 
# Features are generally real-valued, but may be Boolean or discrete-valued in some cases.
# X = df.iloc[:,0:4]
feature_cols = ['instore','amt','region','items']
X = df[feature_cols] #Features
print('Summary of feature sample')
X.head()
Summary of feature sample
Out[28]:
instore amt region items
0 0 281.03 2 4
1 0 219.51 2 2
2 1 1525.70 4 3
3 1 715.25 3 3
4 1 1937.50 1 4
In [29]:
# Select the Dependent Variable or Target Array
# Target array by convention we will usually call y. The target array is usually one dimensional, with length n_samples, 
# and is generally contained in a NumPy array or Pandas Series. The target array may have continuous numerical values, or 
# discrete classes/labels. While some Scikit-Learn estimators do handle multiple target values in the form of a 2-dimensional,
# [n_samples, n_targets] target array, we will primarily be working with the common case of a one-dimensional target array.
# (Hint: Investigate the Relationship Between the Region of Purchase and a Customer's Age):
y = df.age 

Compare two classification AlgorithmsΒΆ

In order to do this efficiently, create a copy and store each instance we've already imported in a list. Create an empty list and append it with both algorithms as follows:

In [30]:
algos_Class = [] # empty list
algos_Class.append(('Random Forest Classifier = ', RandomForestClassifier()))
algos_Class.append(('Decision Tree Classifier = ', DecisionTreeClassifier()))

To build and assess both models, we create an empty list to store the results and another to hold the name of each algorithm so we can easily print out the results and keep them separated as follows:

In [31]:
# classification
results = []
names = []
for name, model in algos_Class:
    accuracy = cross_val_score(model, X, y, cv = 3, scoring ='accuracy')
    names.append(name)
    results.append(accuracy)
In [32]:
for i in range(len(names)):
    print(names[i], results[i].mean())
Random Forest Classifier =  0.020892975034898675
Decision Tree Classifier =  0.021180546458869864

only 2% accuracy. Very bad

In [33]:
# use age_g1 for target
y1 = df.age_g1 
In [34]:
# classification 3 
results = []
names = []
for name, model in algos_Class:
    accuracy = cross_val_score(model, X, y1, cv = 3, scoring ='accuracy')
    names.append(name)
    results.append(accuracy)
In [35]:
 for i in range(len(names)):
    print(names[i], results[i].mean())
Random Forest Classifier =  0.20714184228941065
Decision Tree Classifier =  0.20640416115131974

slight improvement to the accuracy scores to 21%

In [36]:
# replace amt by amt_g1 from features
feature_cols = ['instore','region','amt_g1']
X1 = df[feature_cols] #Features
In [37]:
# classification 4 
results = []
names = []
for name, model in algos_Class:
    accuracy = cross_val_score(model, X1, y1, cv = 3, scoring ='accuracy')
    names.append(name)
    results.append(accuracy)
In [38]:
for i in range(len(names)):
    print(names[i], results[i].mean())
Random Forest Classifier =  0.23751235081736413
Decision Tree Classifier =  0.23766238832674147

slight improvement to the accuracy scores to 24%

In [39]:
# replace amt_g1 by amt_g2 from features
feature_cols = ['instore','region','amt_g2']
X2 = df[feature_cols] #Feature
In [40]:
# classification 5
results = []
names = []
for name, model in algos_Class:
    accuracy = cross_val_score(model, X2, y1, cv = 3, scoring ='accuracy')
    names.append(name)
    results.append(accuracy)
In [41]:
for i in range(len(names)):
    print(names[i], results[i].mean())
Random Forest Classifier =  0.23818751960956216
Decision Tree Classifier =  0.23793745709393324

insignificant improvement

In [42]:
# use age_g2
y2 = df.age_g2
In [43]:
# classification 6
results = []
names = []
for name, model in algos_Class:
    accuracy = cross_val_score(model, X2, y2, cv = 3, scoring ='accuracy')
    names.append(name)
    results.append(accuracy)
In [44]:
for i in range(len(names)):
    print(names[i], results[i].mean())
Random Forest Classifier =  0.5335400544310652
Decision Tree Classifier =  0.5335400544310652

significant improvement (less target bins seems to improve predictions) to 53%

In [45]:
# use age_gtext
y3 = df.age_gtext
In [46]:
# classification 7
results = []
names = []
for name, model in algos_Class:
    accuracy = cross_val_score(model, X2, y3, cv = 5, scoring ='accuracy')
    names.append(name)
    results.append(accuracy)
In [47]:
for i in range(len(names)):
    print(names[i], results[i].mean())
Random Forest Classifier =  0.6192125297657645
Decision Tree Classifier =  0.6192125297657645

reduce the target to 3 bins, improve the predictions to 62%

In [48]:
# classification 8
results = []
names = []
for name, model in algos_Class:
    accuracy = cross_val_score(model, X1, y3, cv = 5, scoring ='accuracy')
    names.append(name)
    results.append(accuracy)
In [49]:
for i in range(len(names)):
    print(names[i], results[i].mean())
Random Forest Classifier =  0.6192125297657645
Decision Tree Classifier =  0.6192125297657645

Fixed y3, changed x with amt_g1, amt_g2, amt_gtext, same results. That is to improve the prediction, we will have to reduce target bins to 2.

In [50]:
# use age_g3
y4 = df.age_g3
In [51]:
# classification 8
results = []
names = []
for name, model in algos_Class:
    accuracy = cross_val_score(model, X2, y4, cv = 5, scoring ='accuracy')
    names.append(name)
    results.append(accuracy)
In [52]:
for i in range(len(names)):
    print(names[i], results[i].mean())
Random Forest Classifier =  0.7470585784927007
Decision Tree Classifier =  0.7470585784927007

accuracy improved to 75%

Assessing PerformanceΒΆ

Now that cross validation has been used for choosing a model, it is time to proceed with training the model and assessing its performance. Since this is a classification problem we will specify three different performance metrics to assess the performance of the model and the accuracy of its predictions. The three metrics are Accuracy, Weighted Mean Recall, Weighted Mean Precision

Train the modelΒΆ

In [53]:
# Step 1:
# Train/Test Split: splitting data into 70% for training and 30% for testing and ensuring the data
# is randomly stratified to prevent and bias from occurring due to any ordering of the data.
X_train, X_test, y_train, y_test = train_test_split(X2, y4, test_size = .30, random_state = 123)
In [54]:
# Step 2:
# instantiate the algorithm just as we did previously, and fit or apply the algorithm
# to our training sets to build the model as follows:
# Modeling (Classification)
algo = DecisionTreeClassifier()
model = algo.fit(X_train,y_train)
In [55]:
# Step 3:
# Use the fully trained model on the testing set to make predictions before it can be assessed:
# Predictions
preds = model.predict(X_test)
In [56]:
# Step 4:
# Print out the classification report using the predictions just made and the ground truth or 
# the actual values from the testing set to assess the model with a given metric as follows
#
# Precision is the ability of a classiifer not to label an instance positive that is actually negative. 
# For each class it is defined as as the ratio of true positives to the sum of true and false positives. 
# Said another way, β€œfor all instances classified positive, what percent was correct?” 
#
# Recall is the ability of a classifier to find all positive instances. 
# For each class it is defined as the ratio of true positives to the sum of true positives and false negatives. 
# Said another way, β€œfor all instances that were actually positive, what percent was classified correctly?”
#
# The F1 score is a weighted harmonic mean of precision and recall such that the best score is 1.0 and the worst is 0.0.
# Generally speaking, F1 scores are lower than accuracy measures as they embed precision and recall into their computation. 
# As a rule of thumb, the weighted average of F1 should be used to compare classifier models, not global accuracy.
#
# Support is the number of actual occurrences of the class in the specified dataset. Imbalanced support in the training data
# may indicate structural weaknesses in the reported scores of the classifier and could indicate the need for stratified 
# sampling or rebalancing. Support doesn’t change between models but instead diagnoses the evaluation process.
#
# High recall, low precision: This means that most of the positive examples are correctly recognized (low FN) but there 
# are a lot of false positives.
#
# Low recall, high precision: This shows that we miss a lot of positive examples (high FN) but those we predict as 
# positive are indeed positive (low FP)
#
print(classification_report(y_test, preds))
              precision    recall  f1-score   support

           1       0.82      0.84      0.83     17656
           2       0.52      0.49      0.51      6338

    accuracy                           0.75     23994
   macro avg       0.67      0.66      0.67     23994
weighted avg       0.74      0.75      0.74     23994

In the classification report, for all identifed as age<= 55, 82% are correctly identified. (precision). For all instances that were actually age <=55, 84% was classified correctly (Recall). For the age > 55 group , both precision and recall are close to 50%. Which is not too good. Since there are more count in the age<=55 group, the weighted mean recall and weighted precision are close to 75%.

In [57]:
#print accuracy score
print('Accuracy Score: ',accuracy_score(y_test, preds))
Accuracy Score:  0.7460615153788447

The accuracy_score function is also incorporated in the classification report.

In [58]:
#output Confusion matrix
confusion_matrix(y_test, preds)
Out[58]:
array([[14789,  2867],
       [ 3226,  3112]], dtype=int64)
In [66]:
# Calculate Classification Rate/Accuracy:
# Classification Rate or Accuracy is given by the relation:
# accuracy = (TP + TN) /(TP + TN + FP + FN)
# However, there are problems with accuracy. It assumes equal costs for both kinds of errors.
# A 99% accuracy can be excellent, good, mediocre, poor or terrible depending upon the problem.
Acc= (14789+3226)/(14789+2867+3226+3112)
Acc
Out[66]:
0.750812703175794
In [65]:
# Calculate Recall 
# Recall = TP/(TP+TN)
Rec = 14789/(14789+3226)
Rec
Out[65]:
0.8209270052733832
In [67]:
# Step 5:
# To visualize the decision tree
from sklearn.externals.six import StringIO 
from IPython.display import Image 
from sklearn.tree import export_graphviz
import pydotplus
C:\Users\cthian972\Anaconda3\envs\test6\lib\site-packages\sklearn\externals\six.py:31: FutureWarning: The module is deprecated in version 0.21 and will be removed in version 0.23 since we've dropped support for Python 2.7. Please rely on the official version of six (https://pypi.org/project/six/).
  "(https://pypi.org/project/six/).", FutureWarning)
In [102]:
age_values = ['<=55','>55'] # this is just a list specifying the region classes
In [103]:
dot_data = StringIO()
export_graphviz(model, out_file = dot_data, filled = True, rounded = True,
feature_names=X2.columns, class_names = region_values, label='all', precision = 1, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) 
#graph.write_png('tree.png')
Image(graph.create_png())
Out[103]:
In [104]:
# Repeat steps 1-4: splitting data into 75% for training and 25% for testing and ensuring the data
X_train, X_test, y_train, y_test = train_test_split(X1, y4, test_size = .25, random_state = 123)
algo = DecisionTreeClassifier()
model = algo.fit(X_train,y_train)
preds = model.predict(X_test)
print(classification_report(y_test, preds))
              precision    recall  f1-score   support

           1       0.82      0.84      0.83     14753
           2       0.52      0.50      0.51      5242

    accuracy                           0.75     19995
   macro avg       0.67      0.67      0.67     19995
weighted avg       0.74      0.75      0.75     19995

In [105]:
dot_data = StringIO()
export_graphviz(model, out_file = dot_data, filled = True, rounded = True,
feature_names=X1.columns, class_names = region_values, label='all', precision = 1, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) 
Image(graph.create_png())
Out[105]:
In [106]:
# Step2 1-4: splitting data into 80% for training and 20% for testing and ensuring the data
X_train, X_test, y_train, y_test = train_test_split(X2, y4, test_size = .20, random_state = 35)
algo = DecisionTreeClassifier()
model = algo.fit(X_train,y_train)
preds = model.predict(X_test)
print(classification_report(y_test, preds))
              precision    recall  f1-score   support

           1       0.82      0.84      0.83     11755
           2       0.53      0.49      0.51      4241

    accuracy                           0.75     15996
   macro avg       0.67      0.67      0.67     15996
weighted avg       0.74      0.75      0.74     15996

In [107]:
dot_data = StringIO()
export_graphviz(model, out_file = dot_data, filled = True, rounded = True,
feature_names=X2.columns, class_names = region_values, label='all', precision = 1, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) 
Image(graph.create_png())
Out[107]:
In [74]:
# increase max_depth
model = DecisionTreeClassifier(max_depth=10)
model.fit(X2,y4)
Out[74]:
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=10, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')
In [75]:
model.score(X2,y4)
Out[75]:
0.7470586028832569
In [76]:
model.predict([[0,3,10]]) #prediction
Out[76]:
array([1], dtype=int64)
In [77]:
model.predict([[1,3,30]]) #prediction
Out[77]:
array([1], dtype=int64)
In [78]:
model.predict([[0,1,5]]) #prediction
Out[78]:
array([2], dtype=int64)
In [79]:
#change "gini" to "enthropy"
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='enthropy',
                       max_depth=2, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='random')
Out[79]:
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='enthropy',
                       max_depth=2, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='random')
In [80]:
model.score(X2,y4)
Out[80]:
0.7470586028832569

Predict if a customer will buy online or in our storesΒΆ

In [81]:
#Define Features and Target
feature_cols = ['age_g1','amt_g2','region']
X_instore1 = df[feature_cols] #Features
y_instore  = df.instore 
In [82]:
# classification
results = []
names = []
for name, model in algos_Class:
    accuracy = cross_val_score(model, X_instore1, y_instore, cv = 3, scoring ='accuracy')
    names.append(name)
    results.append(accuracy)
In [83]:
for i in range(len(names)):
    print(names[i], results[i].mean())
Random Forest Classifier =  0.8860950866397346
Decision Tree Classifier =  0.8860950866397346

89% accuracy

In [84]:
# Repeat steps 1-4: splitting data into 75% for training and 25% for testing and ensuring the data
X_train, X_test, y_train, y_test = train_test_split(X_instore1, y_instore, test_size = .25, random_state = 123)
algo = DecisionTreeClassifier()
model = algo.fit(X_train,y_train)
preds = model.predict(X_test)
print(classification_report(y_test, preds))
              precision    recall  f1-score   support

           0       0.99      0.78      0.87      9870
           1       0.82      1.00      0.90     10125

    accuracy                           0.89     19995
   macro avg       0.91      0.89      0.89     19995
weighted avg       0.91      0.89      0.89     19995

In [85]:
instore_values = ['online','instore']
In [86]:
dot_data = StringIO()
export_graphviz(model, out_file = dot_data, filled = True, rounded = True,
feature_names=X_instore1.columns, class_names = instore_values, label='all', precision = 1, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) 
#graph.write_png('tree.png')
Image(graph.create_png())
Out[86]:
In [87]:
graph.write_png('tree_instore.png')
Out[87]:
True
In [88]:
# change to age_g3
feature_cols = ['age_g3','amt_g2','region']
X_instore2 = df[feature_cols] #Features
In [89]:
# classification
results = []
names = []
for name, model in algos_Class:
    accuracy = cross_val_score(model, X_instore2, y_instore, cv = 3, scoring ='accuracy')
    names.append(name)
    results.append(accuracy)
In [90]:
for i in range(len(names)):
    print(names[i], results[i].mean())
Random Forest Classifier =  0.8827192042205758
Decision Tree Classifier =  0.8827192042205758
In [91]:
# change to age
feature_cols = ['age','amt_g2','region']
X_instore3 = df[feature_cols] #Features
In [92]:
# classification
results = []
names = []
for name, model in algos_Class:
    accuracy = cross_val_score(model, X_instore3, y_instore, cv = 3, scoring ='accuracy')
    names.append(name)
    results.append(accuracy)
In [93]:
for i in range(len(names)):
    print(names[i], results[i].mean())
Random Forest Classifier =  0.8873704256365302
Decision Tree Classifier =  0.8873454193849674
In [94]:
# change to age and amt
feature_cols = ['age','amt','region']
X_instore4 = df[feature_cols] #Features
In [95]:
# classification
results = []
names = []
for name, model in algos_Class:
    accuracy = cross_val_score(model, X_instore4, y_instore, cv = 3, scoring ='accuracy')
    names.append(name)
    results.append(accuracy)
In [96]:
for i in range(len(names)):
    print(names[i], results[i].mean())
Random Forest Classifier =  0.8488353084369971
Decision Tree Classifier =  0.8466472276570962

age is a better predictor compare to age grouping, but not so for amount grouping

In [97]:
# change to age and amt
feature_cols = ['age','amt_g1','region']
X_instore5 = df[feature_cols] #Features
In [98]:
# classification
results = []
names = []
for name, model in algos_Class:
    accuracy = cross_val_score(model, X_instore5, y_instore, cv = 3, scoring ='accuracy')
    names.append(name)
    results.append(accuracy)
In [99]:
for i in range(len(names)):
    print(names[i], results[i].mean())
Random Forest Classifier =  0.8876830037810665
Decision Tree Classifier =  0.8876830037810665

accuracy improves slightly to 89%

In [100]:
# Repeat steps 1-4: splitting data into 75% for training and 25% for testing and ensuring the data
X_train, X_test, y_train, y_test = train_test_split(X_instore5, y_instore, test_size = .25, random_state = 123)
algo = DecisionTreeClassifier()
model = algo.fit(X_train,y_train)
preds = model.predict(X_test)
print(classification_report(y_test, preds))
              precision    recall  f1-score   support

           0       0.99      0.78      0.87      9870
           1       0.82      0.99      0.90     10125

    accuracy                           0.89     19995
   macro avg       0.91      0.89      0.89     19995
weighted avg       0.90      0.89      0.89     19995

In [101]:
dot_data = StringIO()
export_graphviz(model, out_file = dot_data, filled = True, rounded = True,
feature_names=X_instore5.columns, class_names = instore_values, label='all', precision = 1, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) 
#graph.write_png('tree.png')
Image(graph.create_png())
Out[101]:

using other demorgraphic data to determine the regionsΒΆ

In [110]:
# split dataset in Features and Target variable
# select features
# X = df.iloc[:,0:4]
feature_columns = ['instore','age','items','amt_g2']
X_region = df[feature_columns] #Features
y_region = df.region # Target variable
In [111]:
# classification
results = []
names = []
for name, model in algos_Class:
    accuracy = cross_val_score(model, X_region, y_region, cv = 3, scoring ='accuracy')
    names.append(name)
    results.append(accuracy)
In [112]:
for i in range(len(names)):
    print(names[i], results[i].mean())
Random Forest Classifier =  0.6127733150512581
Decision Tree Classifier =  0.6075219609508701
In [114]:
# classification 2 try cv=5 
results = []
names = []
for name, model in algos_Class:
    accuracy = cross_val_score(model, X_region, y_region, cv = 5, scoring ='accuracy')
    names.append(name)
    results.append(accuracy)
In [115]:
for i in range(len(names)):
    print(names[i], results[i].mean())
Random Forest Classifier =  0.6144362927243221
Decision Tree Classifier =  0.6105852643217072

compare cv=3 and cv=5, only tiny improvement to the accuracy of both random forest and decision tree classifiers

In [119]:
# drop items from features, since "items" is uncorrelated to other variables
feature_columns = ['instore','age','amt_g2']
X_region1 = df[feature_columns] #Features
y_region = df.region # Target variable
In [120]:
# classification 3 
results = []
names = []
for name, model in algos_Class:
    accuracy1 = cross_val_score(model, X_region1, y_region, cv = 3, scoring ='accuracy')
    names.append(name)
    results.append(accuracy)
In [121]:
for i in range(len(names)):
    print(names[i], results[i].mean())
Random Forest Classifier =  0.6105852643217072
Decision Tree Classifier =  0.6105852643217072
In [122]:
# replace age by age_g2, amt_g2 by amt_g1 from features
feature_cols = ['instore','age_g1','amt_g1']
X_region2 = df[feature_cols] #Features
In [123]:
# classification 4 
results = []
names = []
for name, model in algos_Class:
    accuracy = cross_val_score(model, X_region2, y_region, cv = 3, scoring ='accuracy')
    names.append(name)
    results.append(accuracy)
In [124]:
for i in range(len(names)):
    print(names[i], results[i].mean())
Random Forest Classifier =  0.6428687674750752
Decision Tree Classifier =  0.6428687674750752
In [125]:
# replace age by age_g1 from features
feature_cols = ['instore','age_g1','amt_g2']
X_region3 = df[feature_cols] #Feature
In [126]:
# classification 5
results = []
names = []
for name, model in algos_Class:
    accuracy = cross_val_score(model, X_region3, y_region, cv = 3, scoring ='accuracy')
    names.append(name)
    results.append(accuracy)
In [127]:
for i in range(len(names)):
    print(names[i], results[i].mean())
Random Forest Classifier =  0.6424561643242875
Decision Tree Classifier =  0.6424561643242875
In [128]:
#train the model
X_train, X_test, y_train, y_test = train_test_split(X_region2, y_region, test_size = .30, random_state = 123)
algo = DecisionTreeClassifier()
model = algo.fit(X_train,y_train)
In [130]:
# Predictions
preds = model.predict(X_test)
print(classification_report(y_test, preds))
              precision    recall  f1-score   support

           1       0.40      0.52      0.45      4838
           2       0.90      1.00      0.94      5979
           3       0.52      0.23      0.32      5391
           4       0.65      0.72      0.68      7786

    accuracy                           0.64     23994
   macro avg       0.62      0.62      0.60     23994
weighted avg       0.63      0.64      0.62     23994

Region 2 has high precision and high recall. Region 4 has a recall rate of 72%, but 65% of precision. Region 3 has a low recall rate. the overall accuracy, weughted mean precision and recall are in the 64% area.

In [132]:
print(cross_val_score(model, X_region3, y_region))
[0.64347337 0.64091023 0.63978495 0.64497374 0.64520163]